{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Author: Antti Kiviaho\n",
    "# Date: 22.2.2023\n",
    "# Notebook for running dataset normalization and integration\n",
    "# Uses the scib integration environment and pipeline:\n",
    "#\n",
    "# 1. Cell and gene QC filtering\n",
    "# 2. scran normalization through scib\n",
    "# 3. batch-aware scaling with scib (implemented in single-cell-scvi-integrate.py)\n",
    "# 4. batch-aware HVGs with scib (implemented in single-cell-scvi-integrate.py)\n",
    "# 5. scvi-integration to find a shared latent space"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.chdir('/lustre/scratch/kiviaho/prostate_spatial/')\n",
    "import numpy as np\n",
    "import anndata as ad\n",
    "import scanpy as sc\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import scib\n",
    "import matplotlib.pyplot as plt\n",
    "from scipy import sparse\n",
    "from pathlib import Path\n",
    "from scripts.utils import load_from_pickle, save_to_pickle\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "datasets = ['dong_2020','chen_2021','cheng_2022','chen_2022','song_2022','wong_2022','hirz_2023']\n",
    "adata_dict = {}\n",
    "for dataset_id in datasets:\n",
    "    adata = sc.read_h5ad('./sc-reference/'+dataset_id+'/adata_obj.h5ad')\n",
    "    adata_dict[dataset_id] = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def qc_filters(adata, remove_doublets=True):\n",
    "    # requires scib-pipline-R4.0 conda environment !\n",
    "    # import scib\n",
    "    # Filter out cells by using a hybrid of the original publications thresholds\n",
    "    sc.pp.filter_cells(adata, min_counts=600)\n",
    "    sc.pp.filter_cells(adata, min_genes = 300)\n",
    "    sc.pp.filter_genes(adata, min_counts= 10)\n",
    "    # Leave out cells with > 20% mitochondrial reads\n",
    "    adata = adata[adata.obs.pct_counts_mt < 20, :]\n",
    "    if remove_doublets:\n",
    "        sc.external.pp.scrublet(adata)\n",
    "        adata = adata[adata.obs['predicted_doublet']==False]\n",
    "    \n",
    "\n",
    "    return adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for dset in datasets:\n",
    "    adata = adata_dict[dset].copy()\n",
    "    if not sparse.issparse(adata.X):\n",
    "        adata.X = sparse.csr_matrix(adata.X)\n",
    "    \n",
    "    adata = qc_filters(adata)\n",
    "        \n",
    "    scib.preprocessing.normalize(adata,precluster=False, sparsify=False)\n",
    "    # add ids to the data for use after data concatenation\n",
    "    adata.obs['dataset'] = dset\n",
    "    adata_dict[dset] = adata\n",
    "    del adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "save_to_pickle(adata_dict,'./sc-reference/normalized_sc_7_datasets.pickle')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
